setwd("C:/Users/horat/Desktop/CSIROIntership/soilCode")


library(dplyr)

#create pivot table 
library(reshape)
library(data.table)

#data partition seperate trainset and testset
library (caTools)

library(caret)

#svm library due to limitation of iterations change the library
library(e1071)
library(LiblineaR)

#random forest
library(randomForest)

#ID4 Decision Tree classifier(CART)
library(rpart)
library(rpart.plot)
library(rattle)

#xgboost
library(xgboost)

#for knn classification
library(class)

#install neuralnetwork
library(neuralnet)

#adabag library
library(adabag)

#Stochastic Gradient Descent (SGD) Method Learning Function
library(gradDescent)
library(lightgbm)
#https://www.kaggle.com/c/amazon-employee-access-challenge/discussion/5128#38925

#matrix library
library(Matrix)

#catboost
library(catboost)

#fast naive bayes
library("fastNaiveBayes")

#tidyverse for easy data manipulation and visualization
#caret for easy machine learning workflow

library(tidyverse)
library(caret)

featureSoilTable <- read.csv(file = "featureTable.csv",stringsAsFactors=FALSE)

Grouping data in a Pivot Table

print(head(featureSoilTable))

create the normalize function

normalize <- function(x){
  return (as.numeric((x-min(x))/(max(x)-min(x))))
}

preprocessing of the featuring table

#change the NULL to na
featureSoilTable['h_texture'][featureSoilTable['h_texture'] == "NULL"] <- NA
#add appendix to colname:
colnames(featureSoilTable) <- paste("Str",colnames(featureSoilTable),sep = "_")

remove invalid value and set NA value to 0

#extract valid and invalid soil sample
validsoilTexture <- featureSoilTable[!is.na(featureSoilTable$Str_h_texture),]
invalidsoilTexture <- featureSoilTable[is.na(featureSoilTable$Str_h_texture),]

#remove all columns with na
validsoilTexture <- validsoilTexture[,colSums(is.na(validsoilTexture))<nrow(validsoilTexture)]

#change null value to 0
validsoilTexture[is.na(validsoilTexture)] = 0

set x to numeric

validsoilTexture$Str_h_texture <- as.numeric(as.factor(validsoilTexture$Str_h_texture))
validsoilTexture <- apply(validsoilTexture, 2, as.factor)
validsoilTexture <- apply(validsoilTexture, 2, as.numeric)
validsoilTexture[,-1]<- (apply(validsoilTexture[,-1],2,normalize))

set random seed

set.seed(122)

give the valid sample

split = sample.split(validsoilTexture$Str_h_texture,SplitRatio = 0.7)

train_set = subset(validsoilTexture, split == TRUE)
test_set = subset(validsoilTexture, split == FALSE)

train_set$Str_h_texture = as.numeric(train_set$Str_h_texture)
test_set$Str_h_texture = as.numeric(test_set$Str_h_texture)
summary(train_set)
 Str_h_texture    Str_samp_no       Str_labr_no      Str_X1.40E.02      Str_X1.40E.04       Str_X1.80E.03        Str_X10_BC       
 Min.   : 1.00   Min.   :0.00000   Min.   :0.00000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000  
 1st Qu.:27.00   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000  
  Str_X10A_NR          Str_X10A1           Str_X10B          Str_X10B_NR         Str_X10B1          Str_X10B3           Str_X10D1        
 Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000  
   Str_X11A1         Str_X12_HCL_CU     Str_X12_HCL_FE     Str_X12_HCL_MN     Str_X12_HCL_ZN     Str_X12_HF_CU      Str_X12_HF_FE     
 Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.000000   Min.   :0.00e+00   Min.   :0.00e+00   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.000000   1st Qu.:0.00e+00   1st Qu.:0.00e+00   1st Qu.:0.002107   1st Qu.:0.000000  
 Str_X12_HF_MN       Str_X12_HF_ZN       Str_X12_NR_CU      Str_X12_NR_FE       Str_X12_NR_MN      Str_X12_NR_ZN       Str_X12_XRF_CU     
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.0000000  
 Str_X12_XRF_FE      Str_X12_XRF_MN      Str_X12_XRF_ZN       Str_X12A1_CU        Str_X12A1_FE      Str_X12A1_MN       Str_X12A1_ZN      
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00000   Min.   :0.000000   Min.   :0.0000000  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.0000000  
  Str_X12B1_CU        Str_X12B1_ZN         Str_X12C1          Str_X12C2          Str_X13_C_FE       Str_X13_NR_AL       Str_X13_NR_FE     
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000e+00   Min.   :0.0000000   Min.   :0.00e+00  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000e+00   1st Qu.:0.0000000   1st Qu.:0.00e+00  
 Str_X13_NR_MN       Str_X13A1_AL       Str_X13A1_FE       Str_X13A1_MN        Str_X13A1_SI        Str_X13B1_AL        Str_X13B1_FE      
 Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000e+00   Min.   :0.000e+00  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000e+00   1st Qu.:0.000e+00  
 Str_X13C_C_FE       Str_X13C1_AL        Str_X13C1_FE      Str_X13C1_FE203     Str_X13C1_MN       Str_X13C1_SI        Str_X14_NR_S      
 Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.000000   Min.   :0.00e+00   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.00e+00   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000  
    Str_X140          Str_X14B1           Str_X14C1          Str_X14D1_C        Str_X14D2_BC         Str_X14F1          Str_X14H1_CA      
 Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000  
  Str_X14H1_K         Str_X14H1_MG        Str_X14H1_NA      Str_X15_BASES     Str_X15_HSK_CEC      Str_X15_NR       Str_X15_NR_AL      
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.0e+00   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.0e+00   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000  
 Str_X15_NR_BSa     Str_X15_NR_BSP     Str_X15_NR_CA      Str_X15_NR_CEC     Str_X15_NR_CMR     Str_X15_NR_ESP      Str_X15_NR_H      
 Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000  
  Str_X15_NR_K      Str_X15_NR_MG      Str_X15_NR_MN       Str_X15_NR_NA       Str_X15A1_CA      Str_X15A1_CEC       Str_X15A1_K    
 Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000  
  Str_X15A1_MG     Str_X15A1_MN       Str_X15A1_NA        Str_X15A2_CA      Str_X15A2_CEC       Str_X15A2_K        Str_X15A2_MG    
 Min.   :0.0000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.00000  
 1st Qu.:0.0000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.00000  
  Str_X15A2_NA       Str_X15A3_NA        Str_X15B1_CA       Str_X15B1_K        Str_X15B1_MG        Str_X15B1_NA       Str_X15B2_CA      
 Min.   :0.000000   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000  
 1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000  
 Str_X15B2_CEC        Str_X15B2_K         Str_X15B2_MG        Str_X15B2_NA        Str_X15C1_CA      Str_X15C1_CEC      Str_X15C1_K      
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.00000   Min.   :0.000000  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.000000  
  Str_X15C1_MG       Str_X15C1_NA      Str_X15D1_AL       Str_X15D1_CA       Str_X15D1_CEC       Str_X15D1_K         Str_X15D1_MG      
 Min.   :0.000000   Min.   :0.00000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000  
  Str_X15D1_NA    Str_X15D2_CA       Str_X15D2_CEC        Str_X15D2_K         Str_X15D2_MG        Str_X15D2_NA        Str_X15E1_AL     
 Min.   :0e+00   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000  
 1st Qu.:0e+00   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000  
  Str_X15E1_CA      Str_X15E1_CEC       Str_X15E1_H         Str_X15E1_K         Str_X15E1_MG       Str_X15E1_MN       Str_X15E1_NA      
 Min.   :0.000000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000  
 1st Qu.:0.000000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000  
  Str_X15E2_CA  Str_X15E2_K  Str_X15E2_MG  Str_X15E2_NA  Str_X15F1_CA      Str_X15F1_CEC       Str_X15F1_K        Str_X15F1_MG     
 Min.   :0     Min.   :0    Min.   :0     Min.   :0     Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0     1st Qu.:0    1st Qu.:0     1st Qu.:0     1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
  Str_X15F1_NA        Str_X15F2          Str_X15F2_AL         Str_X15F3          Str_X15F4           Str_X15G_C       Str_X15G_C_AL1    
 Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.000000  
 Str_X15G_C_AL2      Str_X15G_C_H1        Str_X15G_H         Str_X15G1          Str_X15G1_AL        Str_X15G1_H         Str_X15I3        
 Min.   :0.0000000   Min.   :0.000000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000  
 1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000  
   Str_X15I4        Str_X15J_BASES      Str_X15J_C          Str_X15J_H        Str_X15J1          Str_X15L1        Str_X15L1_a      
 Min.   :0.000000   Min.   :0.00000   Min.   :0.0000000   Min.   :0.00000   Min.   :0.000000   Min.   :0.00000   Min.   :0.000000  
 1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.0000000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.000000  
   Str_X15N1         Str_X15N1_a        Str_X15N1_b        Str_X17A_HF.        Str_X17A_NR         Str_X17A1          Str_X18_NR       
 Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000e+00   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000e+00   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000  
  Str_X18_NR_K        Str_X18A1         Str_X18A1_NR         Str_X18B1           Str_X18B2          Str_X18F1_AL       Str_X18F1_AS      
 Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000  
  Str_X18F1_B         Str_X18F1_CA       Str_X18F1_CD       Str_X18F1_CO       Str_X18F1_CU        Str_X18F1_FE       Str_X18F1_K      
 Min.   :0.0000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.000000  
  Str_X18F1_MG       Str_X18F1_MN       Str_X18F1_MO       Str_X18F1_NA        Str_X18F1_NI        Str_X18F1_P         Str_X18F1_PB     
 Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000  
  Str_X18F1_S        Str_X18F1_SE       Str_X18F1_ZN       Str_X19_COL        Str_X19A1         Str_X19B_NR         Str_X19B1       
 Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.0e+00   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0e+00   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
 Str_X2.00E.01    Str_X2.00E.02       Str_X2_LOI          Str_X2A1          Str_X2D1          Str_X2Z1_R1       Str_X2Z1_R2      
 Min.   :0.0000   Min.   :0.00000   Min.   :0.000000   Min.   :0.00000   Min.   :0.0000000   Min.   :0.00000   Min.   :0.000000  
 1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.0000000   1st Qu.:0.00000   1st Qu.:0.000000  
   Str_X2Z2_C      Str_X2Z2_CLAY       Str_X2Z2_CS       Str_X2Z2_FS        Str_X2Z2_S        Str_X2Z2_Z         Str_X3_C_B      
 Min.   :0.00000   Min.   :0.000000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.000000  
   Str_X3_NR        Str_X3A_C_2.5        Str_X3A_TSS           Str_X3A1           Str_X4_NR        Str_X4A_C_1       Str_X4A_C_2.5    
 Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00000   Min.   :0.000000   Min.   :0.00000  
 1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000  
    Str_X4A1         Str_X4B_AL        Str_X4B_AL_NR      Str_X4B_C_2.5          Str_X4B1         Str_X4B2       Str_X4C_C_1      
 Min.   :0.00000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000000  
 1st Qu.:0.00000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000000  
    Str_X4C1          Str_X4G_NR        Str_X5_C_B        Str_X5_NR         Str_X5A_C_2.5         Str_X5A_NR           Str_X5A1        
 Min.   :0.000000   Min.   :0.0e+00   Min.   :0.00000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.000000   1st Qu.:0.0e+00   1st Qu.:0.00000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000  
    Str_X5A2           Str_X6_DC            Str_X6A1          Str_X6A1_UC          Str_X6B1            Str_X6B2          Str_X6B3       
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.00000   Min.   :0.000000  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.00000   1st Qu.:0.000000  
    Str_X6Z            Str_X7_C_B          Str_X7_NR            Str_X7A1           Str_X7A2          Str_X7A2a            Str_X7A5       
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000  
    Str_X7B1         Str_X7C_CASO4          Str_X7C1          Str_X7C1a           Str_X7C1b           Str_X7C1d           Str_X7C1e        
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000  
    Str_X8A1         Str_X9.00E.02        Str_X9_E_NR          Str_X9_NR          Str_X9A_HCL        Str_X9A_HCLP2O5      Str_X9A_HF.      
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00e+00  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00e+00  
   Str_X9A_NR         Str_X9A_S14           Str_X9A1           Str_X9A3          Str_X9A3a           Str_X9B_9C          Str_X9B_NR      
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000  
    Str_X9B1            Str_X9B2          Str_X9B2_COL        Str_X9BUFF_0       Str_X9BUFF_0.5       Str_X9BUFF_1        Str_X9BUFF_2      
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000  
  Str_X9BUFF_4          Str_X9C2           Str_X9D2            Str_X9E          Str_X9G_BSES          Str_X9G1            Str_X9G2        
 Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000  
   Str_X9H_NR    Str_X9H1           Str_X9I1           Str_X9J2           Str_X9R1            Str_M1a    Str_MIN_EC       Str_MIN_NR_K2O    
 Min.   :0    Min.   :0.000000   Min.   :0.001248   Min.   :0.000000   Min.   :0.0000000   Min.   :0   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0    1st Qu.:0.001597   1st Qu.:0.001248   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0   1st Qu.:0.000000   1st Qu.:0.000000  
  Str_P10_1m2m      Str_P10_20_100     Str_P10_20_75      Str_P10_20_75a      Str_P10_75_106      Str_P10_CF_C     Str_P10_CF_CS    
 Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.00000   Min.   :0.00000  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.00000  
 Str_P10_CF_FS     Str_P10_CF_S       Str_P10_CF_Z      Str_P10_GRAV      Str_P10_gt2m      Str_P10_gt2MI      Str_P10_gt2OM     
 Min.   :0.0000   Min.   :0.000000   Min.   :0.00000   Min.   :0.00000   Min.   :0.000000   Min.   :0.00e+00   Min.   :0.00e+00  
 1st Qu.:0.0000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00e+00   1st Qu.:0.00e+00  
 Str_P10_HYD_C      Str_P10_HYD_CS     Str_P10_HYD_FS     Str_P10_HYD_Z       Str_P10_NR_C     Str_P10_NR_CS     Str_P10_NR_FS    
 Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
  Str_P10_NR_S     Str_P10_NR_Saa     Str_P10_NR_Z     Str_P10_NR_ZC       Str_P10_PB_C     Str_P10_PB_CS     Str_P10_PB_FS    
 Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00e+00   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
 1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00e+00   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
  Str_P10_PB_S       Str_P10_PB_Z     Str_P10_PB1_C       Str_P10_PB1_CS      Str_P10_PB1_FS      Str_P10_PB1_Z       Str_P10_S_0.20    
 Min.   :0.000000   Min.   :0.00000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000  
 1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000  
 Str_P10_S_0.48      Str_P10_S_1       Str_P10_S_1000     Str_P10_S_125      Str_P10_S_15.6      Str_P10_S_2        Str_P10_S_20     
 Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
 Str_P10_S_2000     Str_P10_S_250       Str_P10_S_3.9      Str_P10_S_31.2      Str_P10_S_500       Str_P10_S_53      Str_P10_S_63      
 Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.00000   Min.   :0.0000000  
 1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.0000000  
 Str_P10_S_7.8      Str_P10100_200      Str_P10106_150    Str_P10150_180     Str_P10180_300     Str_P10200_500      Str_P10200_600     
 Min.   :0.000000   Min.   :0.0000000   Min.   :0.00000   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000  
 Str_P102002000      Str_P10300_600     Str_P105002000      Str_P106001000     Str_P106002000       Str_P10A1_C        Str_P10A1_CS     
 Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.000000  
  Str_P10A1_FS       Str_P10A1_Z         Str_P3A_NR          Str_P3A1        Str_P3A1_C4       Str_P3A1_CLOD        Str_P3A1_e      
 Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.00000   Min.   :0.000000   Min.   :0.000000   Min.   :0.00e+00  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.00e+00  
  Str_P3A2_McK       Str_P3A2_McKMP      Str_P3B_GV_01      Str_P3B_GV_03       Str_P3B_GV_15       Str_P3B_NR_005     Str_P3B_NR_01      
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.0000000  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.0000000  
 Str_P3B_NR_15      Str_P3B_VL_01      Str_P3B_VL_15      Str_P3B1GV_15        Str_P3B1VL_1       Str_P3B1VL_15        Str_P3B2GV_1     
 Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00e+00  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00e+00  
 Str_P3B2GV_15      Str_P3B2GV_5      Str_P3B2VL_03        Str_P3B2VL_1      Str_P3B2VL_15       Str_P3B2VL_5      Str_P3B3VLa001    
 Min.   :0.0e+00   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.0e+00   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
 Str_P3B3VLa005     Str_P3B3VLa01       Str_P3B3VLa03      Str_P3B3VLa06      Str_P3B3VLaSAT    Str_P3B3VLb001     Str_P3B3VLb003    
 Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.00e+00   Min.   :0.00000   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.00e+00   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.000000  
 Str_P3B3VLb005     Str_P3B3VLb01      Str_P3B3VLb03      Str_P3B3VLb05      Str_P3B3VLb06     Str_P3B3VLbSAT     Str_P3B3VLc001    
 Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.00000   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.000000  
 Str_P3B3VLc003     Str_P3B3VLc005     Str_P3B3VLc01      Str_P3B3VLc03     Str_P3B3VLc06       Str_P3B3VLcSAT     Str_P3B3VLd06      
 Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.00000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000  
  Str_P3B3VLd1       Str_P3B3VLd15      Str_P3B3VLd3        Str_P3B3VLd5       Str_P3B3VLe004     Str_P3B3VLe01       Str_P3B3VLe03      
 Min.   :0.0000000   Min.   :0.00000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.0000000   1st Qu.:0.00000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000  
 Str_P3B3VLe06       Str_P3B3VLe15        Str_P3B3VLe2        Str_P3B3VLe7       Str_P3B4GV_01      Str_P3B4VL_005     Str_P3B5GV_01     
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
 Str_P4_100DMcK      Str_P4_10DMcK       Str_P4_30_LOV       Str_P4_30DMcK      Str_P4_50_McK       Str_P4_50DMcK         Str_P4_sat      
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00e+00  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00e+00  
 Str_P4_sat_FH       Str_P4_sat_For      Str_P4_sat_LOV      Str_P4_sat_McK       Str_P5_COLE      Str_P5_LS_MOD         Str_P6_LP       
 Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00000   Min.   :0.0000000   Min.   :0.000000  
 1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00000   1st Qu.:0.0000000   1st Qu.:0.000000  
  Str_PWS1.2mm       Str_PWS20.63      Str_PWS212.425     Str_PWS425.1mm    Str_PWS63.212       Str_TE_NR_AL       Str_TE_NR_AL2O     
 Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.00000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.0000000  
  Str_TE_NR_CA      Str_TE_NR_FE20       Str_TE_NR_MG        Str_TE_NR_NA      Str_TE_NR_SI02     Str_TE_NR_TI02      Str_TE_XRF_MG      
 Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.0000000  
  Str_TE_XRFAL       Str_TE_XRFCA        Str_TE_XRFNA      Str_TE_XRFSI02      Str_TE_XRFTIO2     Str_XRD_C_Amp       Str_XRD_C_An      
 Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.000000   Min.   :0.00e+00   Min.   :0.0000000  
 1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.00e+00   1st Qu.:0.0000000  
 Str_XRD_C_Bhm       Str_XRD_C_Bt       Str_XRD_C_Cal      Str_XRD_C_Ch2       Str_XRD_C_Chl      Str_XRD_C_Fsp      Str_XRD_C_Gbs      
 Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.00e+00   Min.   :0.0000000  
 1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.00e+00   1st Qu.:0.0000000  
 Str_XRD_C_Gth       Str_XRD_C_Hem      Str_XRD_C_Ht0       Str_XRD_C_Ilt       Str_XRD_C_Is    Str_XRD_C_K2O        Str_XRD_C_Ka    
 Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000   Min.   :0.0000000   Min.   :0.00000  
 1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:0.0000000   1st Qu.:0.00000  
 Str_XRD_C_Kln        Str_XRD_C_Lp      Str_XRD_C_Mag      Str_XRD_C_Mca      Str_XRD_C_Mgh       Str_XRD_C_Mnt        Str_XRD_C_Ms      
 Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.00e+00   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.0000000  
 1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.00e+00   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.0000000  
 Str_XRD_C_Plg      Str_XRD_C_Plm       Str_XRD_C_Qz       Str_XRD_C_Rt      Str_XRD_C_Sme        Str_XRD_C_Tc      Str_XRD_C_Vrm      
 Min.   :0.00e+00   Min.   :0.00e+00   Min.   :0.000000   Min.   :0.00e+00   Min.   :0.0000000   Min.   :0.00e+00   Min.   :0.0000000  
 1st Qu.:0.00e+00   1st Qu.:0.00e+00   1st Qu.:0.000000   1st Qu.:0.00e+00   1st Qu.:0.0000000   1st Qu.:0.00e+00   1st Qu.:0.0000000  
 [ reached getOption("max.print") -- omitted 4 rows ]
# Find the best model with the best cost parameter via 10-fold cross-validations

# the tunning part of svm, which will take lots of time to run

tryTypes=c(0:7)
tryCosts=c(1000,1,0.001)
bestCost=NA
bestAcc=0.6290723
bestType=NA

for(ty in tryTypes){

   for(co in tryCosts){
    acc=LiblineaR(data=train_set[,-1],target=train_set[,c("Str_h_texture")],type=7,cost=co,bias=1,verbose=FALSE)
    cat("Results for C=",co," : ",acc," accuracy.\n",sep="")
    if(acc>bestAcc){
    bestCost=co
    bestAcc=acc
    bestType=ty
    }
  }

}

svm classifier

LIBLINEAR is a linear classifier for data with millions of instances and features. It supports L2-regularized classifiers, L2-loss linear SVM, L1-loss linear SVM, and logistic regression (LR).LiblineaR allows the estimation of predictive linear models for classification and regression, such as L1- or L2-regularized logistic regression, L1- or L2-regularized L2-loss support vector classification, L2-regularized L1-loss support vector classification and multi-class support vector classification. It also supports L2-regularized support vector regression (with L1- or L2-loss). The estimation of the models is particularly fast as compared to other libraries.

svmStarttime <- Sys.time()
svmClassifier <- LiblineaR(data = train_set[,-1],target = train_set[,c("Str_h_texture")],bias=1,cost = 1000)
svmPredictTrain <- predict(svmClassifier,train_set[,-1],proba=TRUE,decisionValues=TRUE)
svmPredictTrainTable <- table(svmPredictTrain$predictions,train_set[,c("Str_h_texture")])

function for calculating the score of matirx

sumElementinTable <- function(a,c,r){
  sum = 0
  for (i in c){
    if (i %in% r){
      sum = sum + a[i,i]
    }
  }
  return(sum)
}

calculating the score of svmClassifier


svmTestcol <- colnames(svmPredictTestTable)
svmTestrow <- rownames(svmPredictTestTable)

svmTraincol <- colnames(svmPredictTrainTable)
svmTrainrow <- rownames(svmPredictTrainTable)


svmPredictTestScore <- sumElementinTable(svmPredictTestTable,svmTestcol,svmTestrow)/sum(svmPredictTestTable)
svmPredictTrainScore <- sumElementinTable(svmPredictTrainTable,svmTraincol,svmTrainrow)/sum(svmPredictTrainTable)
# the time of svm is:
cat("the running time of svm is",svmTimeTaken)
the running time of svm is 50.21468
#the score of svm is

cat("the train score of svm algorithm is ",svmPredictTrainScore,'\n')
the train score of svm algorithm is  0.3311756 
cat("the test score of svm algorithm is ",svmPredictTestScore)
the test score of svm algorithm is  0.3023567

classification is CART model

cartFit <- rpart(Str_h_texture ~ .,data = train_set,control = rpart.control(cp = 0.0001))

#get cp value
printcp(cartFit)

choose the CP with lowest xerror

cartstartTime <- Sys.time()

fit.pruned = prune(cartFit, cp = 0.00020393)

cartPrediction <- predict(fit.pruned, test_set, type = "vector")

cartendTime <- Sys.time()

cartTimeTaken <- cartendTime - cartendTime

data.frame(test_set,cartPrediction)

cartPrediction = round(cartPrediction,0)
cartTable <- table(test_set$Str_h_texture,cartPrediction)

cartTable
    cartPrediction
        5    6    8    9   11   12   13   14   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32   33
  1     0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  3     1    0    0    0    0    2    1    0    0    0    0    0    1    0    0    1    3    2    0    1    1    3    0    0    0    1    1
  4     0    0    0    0    0    0    0    0    0    0    4    0    2    2    0    0    2    3    0    0   12    0    0    0    2    4    1
  5     0    1    4    0    6   50    4    5   22   23   18   13    3   16    2   18   12   18    0   11   24   15    8   52    1   21   30
  6     0    0    0    0    0    0    1    1    3   10    0    0    0    0    0    0   11    8    0    4    0    0    3    8    0    0    4
  7     0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    1    2    0    0    1    2    0
  8     0    0    0    0    0    0    1    0    1   10    4    0    0    1    0    3    3    0    0    0    3    3    0    2    0    8    4
  9     0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  10    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  11    0    0    0    1    0    5    0    1    3    0    9    8    4    3    0    0    5   10    5    1   14    4    3    1    0   16    8
  12    0    0    0    0    0    3    0    0    0    0    2    0    2    0    0    1    7    3    0    2   12    3    1    1    1   15    3
  13    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  14    0    0    0    1    0    2    0    0   10   13    3    0    2    6    1    3   30    4    0    3    8    6    3    2    1    6   14
  15    0    0    0    0    0   10    1    0    0    6    5    1    5    8    1    2   52   22    1   19   12    5    4   19    4    8   11
  16    0    0    0    0    0    0    0    0    0    4    0    0    0    1    0    0    0    1    0    0    3    0    3    3    0    1    5
  17    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  18    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    6    0    0    0    0    0    7    2
  19    0    0    0    0    0    0    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0
    cartPrediction
       34   35   36   37   38   39   40   41   42   43   44   45   46   47   48   49   50   51   52   53   54   55   56   57   61   64
  1     0    0    0    1    4    0    0    0    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0
  3     0    3    3    0   27    0    0    1    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  4     0    4    3    0   49    9    5    2    1    4    0    1    0    0    0    0    1    1    2    0    0    0    0    0    0    0
  5    26   31   12   64  452   38   19   21   31    6   12    2    5    5    3   17    4    3    0    4    0    5    3    0    1    1
  6     0    3    2    6   59   10    5    4    3    0    0    1    0    1    1    0    0    0    0    0    0    0    0    0    0    0
  7     0    0    1    0    7    0    4    1    1    0    0    0    0    1    0    4    0    1    0    0    0    0    0    0    0    0
  8     2    7    6    3  124   12    7    1    3    2    2    3    1    2    2    4    7    3    2    0    0    2    0    4    0    0
  9     0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  10    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  11    1    5   15   14  233   19   21    5   18   10    9   17    5   10    5    6    8    3    2    0    0    3    0    0    0    0
  12    0    8    4    4  102    2    3    1    2    8    1    1    0    7   12    2    1    0    0    0    0    0    0    0    0    0
  13    0    0    0    0    6    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  14    3    5    2    6  116   12    4    4    3    2    6    0    1    0    0    0    0    0    2    1    1    0    0    0    0    0
  15    3   15    4    4  178   21   10    7    7    4    0    2    3    3    1    3    0    3    0    1    0    0    0    0    0    0
  16    0    2    0    3   36    4    0   11    0    0    3    0    0    1    2    0    0    0    1    0    0    0    1    0    0    0
  17    0    0    0    0    9    0    0    2    0    0    0    0    0    0    0    0    0    0    2    0    0    0    0    0    0    0
  18    0    0    0   13   29    6    3    3    2    0    0    1    3    0    1    3    0    0    0    0    0    0    0    0    0    0
  19    0    0    0    0    1    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
 [ reached getOption("max.print") -- omitted 45 rows ]

calculate the score of cart model

cartrow <- rownames(cartTable)
cartcol <- colnames(cartTable)
cartscore <- sumElementinTable(cartTable,cartrow,cartcol)/sum(cartTable)

the time of cart model

cat("the time of cart",cartTimeTaken)
the time of cart 0

the score of cart model

cat('the score of cart model',cartscore)
the score of cart model 0.02074463

lightgbm

separate x and y from train_set and test_set


train_set.num_X <- select (train_set,-c(Str_h_texture))
test_set.num_X <- select (test_set,-c(Str_h_texture))

start lightgbm machine learning algorithms

lstarttime <- Sys.time()
ltrain = lgb.Dataset(data = as.matrix(train_set.num_X),label = train_set$Str_h_texture, free_raw_data = FALSE)
params <- list(objective="regression", metric="l2")
model <- lgb.cv(params, 
                ltrain , 
                10, 
                nfold=5, 
                min_data=1, 
                learning_rate=1, 
                early_stopping_rounds=10,
                Depth = 8,
                lambda_l1 = 10,
                lambda_l2 = 10
)
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38581
[LightGBM] [Info] Number of data points in the train set: 37694, number of used features: 468
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037081 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38581
[LightGBM] [Info] Number of data points in the train set: 37694, number of used features: 468
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037630 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38581
[LightGBM] [Info] Number of data points in the train set: 37694, number of used features: 468
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38581
[LightGBM] [Info] Number of data points in the train set: 37693, number of used features: 468
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38581
[LightGBM] [Info] Number of data points in the train set: 37693, number of used features: 468
[LightGBM] [Warning] Unknown parameter: Depth
[LightGBM] [Info] Start training from score 37.875524
[LightGBM] [Info] Start training from score 37.803815
[LightGBM] [Info] Start training from score 37.814639
[LightGBM] [Info] Start training from score 37.763510
[LightGBM] [Info] Start training from score 37.848672
[1]:    valid's l2:255.554+1.93548 
[2]:    valid's l2:252.462+2.45908 
[3]:    valid's l2:251.348+2.5486 
[4]:    valid's l2:250.173+2.01428 
[5]:    valid's l2:248.854+2.47545 
[6]:    valid's l2:248.224+2.60872 
[7]:    valid's l2:248.128+2.67808 
[8]:    valid's l2:247.48+2.80606 
[9]:    valid's l2:247.181+3.28257 
[10]:   valid's l2:247.176+3.60095 
lstoptime <- Sys.time()

tunning parameters

num_leaves: This is the main parameter to control the complexity of the tree model. Theoretically, we can set num_leaves = 2^(max_depth) to obtain the same number of leaves as depth-wise tree. However, this simple conversion is not good in practice. The reason is that a leaf-wise tree is typically much deeper than a depth-wise tree for a fixed number of leaves. Unconstrained depth can induce over-fitting. Thus, when trying to tune the num_leaves, we should let it be smaller than 2^(max_depth). For example, when the max_depth=7 the depth-wise tree can get good accuracy, but setting num_leaves to 127 may cause over-fitting, and setting it to 70 or 80 may get better accuracy than depth-wise.

min_data_in_leaf: This is a very important parameter to prevent over-fitting in a leaf-wise tree. Its optimal value depends on the number of training samples and num_leaves. Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting. In practice, setting it to hundreds or thousands is enough for a large dataset.

max_depth: You also can use max_depth to limit the tree depth explicitly.

ltest = lgb.Dataset.create.valid(ltrain , as.matrix(test_set.num_X), label = test_set$Str_h_texture)
valids <- list(test = ltest)

grid_search <- expand.grid(Depth = 1:8,
                           L1 = 8:16,
                           L2 = 8:16)

model <- list()
perf <- numeric(nrow(grid_search))

for (i in 1:nrow(grid_search)) {
  model[[i]] <- lgb.train(list(objective = "regression",
                               metric = "l2",
                               lambda_l1 = grid_search[i, "L1"],
                               lambda_l2 = grid_search[i, "L2"],
                               max_depth = grid_search[i, "Depth"]),
                          ltrain,
                          2,
                          valids,
                          min_data = 1,
                          learning_rate = 1,
                          early_stopping_rounds = 5,
                          num_leaves = 2,
                          num_iterations = 1000,
                          min_gain_to_split = 500,)
  
  perf[i] <- min(rbindlist(model[[i]]$record_evals$test$l2))
}

cat("Model ", which.min(perf), " is lowest loss: ", min(perf), sep = "")

print(grid_search[which.min(perf), ])

Algorithms score is around 0.38 and computational time is:

lgbtaketime <- lstoptime - lstarttime
cat("The algorithms takes ", lgbtaketime, "seconds")
The algorithms takes  2.367668 seconds

catboost


catstartTime <- Sys.time()

fit_params <- list(l2_leaf_reg = 0.001,
                   depth=6,
                   learning_rate = 0.1,
                   iterations = 100,
                   random_seed = 233)


pool = catboost.load_pool(as.matrix(train_set.num_X), label = as.integer(train_set[,1]))

model <- catboost.train(pool, params = fit_params)
0:  learn: 16.5371834   total: 32.2ms   remaining: 3.19s
1:  learn: 16.4819241   total: 69.2ms   remaining: 3.39s
2:  learn: 16.4321274   total: 106ms    remaining: 3.42s
3:  learn: 16.3910531   total: 138ms    remaining: 3.32s
4:  learn: 16.3533938   total: 168ms    remaining: 3.19s
5:  learn: 16.3204430   total: 203ms    remaining: 3.18s
6:  learn: 16.2939862   total: 234ms    remaining: 3.11s
7:  learn: 16.2683750   total: 271ms    remaining: 3.11s
8:  learn: 16.2458727   total: 302ms    remaining: 3.05s
9:  learn: 16.2231423   total: 341ms    remaining: 3.07s
10: learn: 16.2049919   total: 375ms    remaining: 3.03s
11: learn: 16.1861848   total: 409ms    remaining: 3s
12: learn: 16.1700804   total: 443ms    remaining: 2.97s
13: learn: 16.1504023   total: 482ms    remaining: 2.96s
14: learn: 16.1353299   total: 518ms    remaining: 2.94s
15: learn: 16.1186926   total: 563ms    remaining: 2.96s
16: learn: 16.1051210   total: 602ms    remaining: 2.94s
17: learn: 16.0876521   total: 638ms    remaining: 2.9s
18: learn: 16.0711332   total: 668ms    remaining: 2.85s
19: learn: 16.0593029   total: 700ms    remaining: 2.8s
20: learn: 16.0460410   total: 736ms    remaining: 2.77s
21: learn: 16.0358604   total: 774ms    remaining: 2.74s
22: learn: 16.0280227   total: 816ms    remaining: 2.73s
23: learn: 16.0157761   total: 855ms    remaining: 2.71s
24: learn: 16.0087857   total: 887ms    remaining: 2.66s
25: learn: 15.9957831   total: 920ms    remaining: 2.62s
26: learn: 15.9863858   total: 957ms    remaining: 2.59s
27: learn: 15.9777166   total: 987ms    remaining: 2.54s
28: learn: 15.9688117   total: 1.02s    remaining: 2.51s
29: learn: 15.9601138   total: 1.06s    remaining: 2.47s
30: learn: 15.9527663   total: 1.1s remaining: 2.44s
31: learn: 15.9453010   total: 1.13s    remaining: 2.4s
32: learn: 15.9372778   total: 1.16s    remaining: 2.36s
33: learn: 15.9286856   total: 1.19s    remaining: 2.31s
34: learn: 15.9228070   total: 1.22s    remaining: 2.27s
35: learn: 15.9157059   total: 1.26s    remaining: 2.23s
36: learn: 15.9106635   total: 1.29s    remaining: 2.21s
37: learn: 15.9037768   total: 1.33s    remaining: 2.17s
38: learn: 15.8939276   total: 1.37s    remaining: 2.14s
39: learn: 15.8890469   total: 1.41s    remaining: 2.11s
40: learn: 15.8833173   total: 1.44s    remaining: 2.08s
41: learn: 15.8744792   total: 1.48s    remaining: 2.04s
42: learn: 15.8702257   total: 1.51s    remaining: 2s
43: learn: 15.8647871   total: 1.54s    remaining: 1.96s
44: learn: 15.8575401   total: 1.57s    remaining: 1.93s
45: learn: 15.8539751   total: 1.61s    remaining: 1.89s
46: learn: 15.8486778   total: 1.64s    remaining: 1.85s
47: learn: 15.8444692   total: 1.68s    remaining: 1.81s
48: learn: 15.8384841   total: 1.71s    remaining: 1.78s
49: learn: 15.8318644   total: 1.75s    remaining: 1.75s
50: learn: 15.8211050   total: 1.78s    remaining: 1.71s
51: learn: 15.8171272   total: 1.81s    remaining: 1.67s
52: learn: 15.8131695   total: 1.84s    remaining: 1.63s
53: learn: 15.8069144   total: 1.88s    remaining: 1.6s
54: learn: 15.8029788   total: 1.91s    remaining: 1.56s
55: learn: 15.7989711   total: 1.94s    remaining: 1.53s
56: learn: 15.7945593   total: 1.98s    remaining: 1.49s
57: learn: 15.7905434   total: 2.01s    remaining: 1.45s
58: learn: 15.7863749   total: 2.04s    remaining: 1.42s
59: learn: 15.7833903   total: 2.08s    remaining: 1.39s
60: learn: 15.7762922   total: 2.12s    remaining: 1.35s
61: learn: 15.7700376   total: 2.15s    remaining: 1.32s
62: learn: 15.7651458   total: 2.19s    remaining: 1.28s
63: learn: 15.7622770   total: 2.22s    remaining: 1.25s
64: learn: 15.7581282   total: 2.25s    remaining: 1.21s
65: learn: 15.7534077   total: 2.28s    remaining: 1.18s
66: learn: 15.7509048   total: 2.32s    remaining: 1.14s
67: learn: 15.7476133   total: 2.36s    remaining: 1.11s
68: learn: 15.7446307   total: 2.39s    remaining: 1.07s
69: learn: 15.7414847   total: 2.43s    remaining: 1.04s
70: learn: 15.7370984   total: 2.46s    remaining: 1s
71: learn: 15.7333530   total: 2.49s    remaining: 969ms
72: learn: 15.7303575   total: 2.53s    remaining: 935ms
73: learn: 15.7274914   total: 2.56s    remaining: 898ms
74: learn: 15.7233322   total: 2.59s    remaining: 864ms
75: learn: 15.7191933   total: 2.63s    remaining: 829ms
76: learn: 15.7132419   total: 2.66s    remaining: 795ms
77: learn: 15.7114110   total: 2.69s    remaining: 760ms
78: learn: 15.7090804   total: 2.73s    remaining: 726ms
79: learn: 15.7052953   total: 2.77s    remaining: 692ms
80: learn: 15.7020065   total: 2.8s remaining: 657ms
81: learn: 15.6988539   total: 2.83s    remaining: 622ms
82: learn: 15.6959701   total: 2.87s    remaining: 588ms
83: learn: 15.6938343   total: 2.91s    remaining: 554ms
84: learn: 15.6901454   total: 2.94s    remaining: 520ms
85: learn: 15.6821179   total: 2.99s    remaining: 486ms
86: learn: 15.6784857   total: 3.03s    remaining: 452ms
87: learn: 15.6749618   total: 3.06s    remaining: 418ms
88: learn: 15.6674241   total: 3.09s    remaining: 382ms
89: learn: 15.6639432   total: 3.13s    remaining: 348ms
90: learn: 15.6576915   total: 3.17s    remaining: 314ms
91: learn: 15.6549104   total: 3.21s    remaining: 279ms
92: learn: 15.6483326   total: 3.24s    remaining: 244ms
93: learn: 15.6451033   total: 3.28s    remaining: 209ms
94: learn: 15.6419415   total: 3.31s    remaining: 174ms
95: learn: 15.6385591   total: 3.35s    remaining: 140ms
96: learn: 15.6362077   total: 3.39s    remaining: 105ms
97: learn: 15.6324720   total: 3.42s    remaining: 69.8ms
98: learn: 15.6281495   total: 3.45s    remaining: 34.9ms
99: learn: 15.6249271   total: 3.49s    remaining: 0us
catstopTime <- Sys.time()

cattakenTime <- catstopTime - catstartTime

calculate the prediction:

#get the prediction
catprediction <- catboost.predict(model, 
                                  pool, 
                                  prediction_type = 'RawFormulaVal')

calculate the program score:

#round the prediction
catprediction <- round(catprediction,0)

catTable <- table(train_set$Str_h_texture,catprediction)

catTablerow <- rownames(catTable)
catTablecol <- colnames(catTable)
catscore <- sumElementinTable(catTable,catTablerow,catTablecol)/sum(catTable)
cat('The algorithm takes' ,cattakenTime , 'seconds')
The algorithm takes 4.16187 seconds
cat('The algorithm scores' ,catscore)
The algorithm scores 0.02073562

naivebayes classification*


nbstarttime <- Sys.time()
  
nbClassifier <- naiveBayes(as.factor(Str_h_texture) ~ .,data = train_set,laplace=2)
nbTestPrediction <- predict(nbClassifier,test_set,type = "class")
nbTableTest <- table(nbTestPrediction,test_set$Str_h_texture)

nbTestTablerow <- rownames(nbTableTest)
nbTestTablecol <- colnames(nbTableTest)
nbTestTablescore<- sumElementinTable(nbTableTest,nbTestTablerow,nbTestTablecol)/sum(nbTableTest)

nbendtime <- Sys.time()

nbTrainPrediction <- predict(nbClassifier,train_set,type = "class")
nbTrainTable <- table(nbTrainPrediction,train_set$Str_h_texture)

nbTrainTablerow <- rownames(nbTrainTable)
nbTrainTablecol <- colnames(nbTrainTable)
nbTrainTablescore <- sumElementinTable(nbTrainTable,nbTrainTablerow,nbTrainTablecol)/sum(nbTrainTable)

nbtakentime <- nbendtime - nbstarttime

nbalgorithm

cat('NaiveBayes takes',nbtakentime,'seconds')
NaiveBayes takes 4.305502 seconds

nbscore

cat('NaiveBayes score',nbTrainTablescore)
NaiveBayes score 355

fastNaiveBayes algorithms by gaussian

fnbstartTime <- Sys.time()
dist <- fnb.detect_distribution(train_set.num_X)
gauss <- fnb.gaussian(train_set.num_X[,dist$gaussian], as.factor(train_set$Str_h_texture),sparse = TRUE,check = FALSE)
pred <- predict(gauss, train_set.num_X[,dist$gaussian])
fnbendTime <- Sys.time()
error <- mean(as.factor(train_set$Str_h_texture)!=pred)
print(error)
fnbtakentime <- fnbendTime - fnbstartTime

print("fastNaiveBayes takes ", fnbtakentime, "seconds")

Algorithms that cannot run in a specific time

neuro network

We can use neuralnet() to train a NN model. Also, the train() function from caret can help us tune parameters. We can plot the result to see which set of parameters is fit our data the best.

tuning parameter

Model <- train(Str_h_texture ~ .,
               data=train_set,
               method="neuralnet",
               ### Parameters for layers
               tuneGrid = expand.grid(.layer1=c(1:2), .layer2=c(0:2), .layer3=c(0)),
               ### Parameters for optmization
               learningrate = 0.01,
               threshold = 0.01,
               stepmax = 5000
)

in nnclassifier y value should be normalized

train_set.norm <- train_set
maxStr_h_texture <- max(train_set.norm$Str_h_texture)
minStr_h_texture <- min(train_set.norm$Str_h_texture)
train_set.norm$Str_h_texture <- normalize(train_set.norm$Str_h_texture)

nnClassifier <- neuralnet(Str_h_texture ~ .,data=train_set.norm, likelihood = TRUE, 
                          hidden = 1,linear.output = F,act.fct = "tanh")
print(nnClassifier$result.matrix)
plot(nnClassifier)

prediction

output<- compute(nnClassifier,train_set[,-1])
p1 <- output$net.result
p1 <- p1 * (maxStr_h_texture-minStr_h_texture)
p1 <- round(p1,0)
nntable<-  table(train_set$Str_h_texture,p1)

Classification with xgBoost

Xgboost can work perfectly in sparse matrix but it unfortunately cannot run in 5 hours

xgb.train = xgb.DMatrix(data = as.matrix(train_set),label =as.matrix(train_set$Str_h_texture))
xgb.test = xgb.DMatrix(data = as.matrix(test_set),label = as.matrix(test_set$Str_h_texture))
validsoilTexture$Str_h_texture <- as.factor(validsoilTexture$Str_h_texture)
num_class = length(levels(validsoilTexture$Str_h_texture))

params = list(
  booster="gbtree",
  eta=0.001,
  max_depth=5,
  gamma=3,
  subsample=0.75,
  colsample_bytree=1,
  objective="multi:softprob",
  eval_metric="mlogloss",
  num_class=num_class+1
)

# Train the XGBoost classifer
xgb.fit=xgb.train(
  params=params,
  data=xgb.train,
  nrounds=10000,
  nthreads=1,
  early_stopping_rounds=10,
  watchlist=list(val1=xgb.train,val2=xgb.test),
  verbose=0
)

xgb.fit

Algorithms that cannot run successfully

Random Forest* The algorithm cannot run successfully since it will give an Error: cannot allocate vector of size 16.5 Gb random forest is bad for sparse data which can be found in https://stats.stackexchange.com/questions/28828/is-there-a-random-forest-implementation-that-works-well-with-very-sparse-data

RfClassifier = randomForest(Str_h_texture ~ .,data = train_set,proximity = T,mtry = 10)

rfTable <- table(predict(RfClassifier),train_set$Str_h_texture)

print(RfClassifier)
plot(RfClassifier)
---
title: "machine learning documentation in R"
output: html_notebook
---
```{r}
setwd("C:/Users/horat/Desktop/CSIROIntership/soilCode")


library(dplyr)

#create pivot table 
library(reshape)
library(data.table)

#data partition seperate trainset and testset
library (caTools)

library(caret)

#svm library due to limitation of iterations change the library
library(e1071)
library(LiblineaR)

#random forest
library(randomForest)

#ID4 Decision Tree classifier(CART)
library(rpart)
library(rpart.plot)
library(rattle)

#xgboost
library(xgboost)

#for knn classification
library(class)

#install neuralnetwork
library(neuralnet)

#adabag library
library(adabag)

#Stochastic Gradient Descent (SGD) Method Learning Function
library(gradDescent)
library(lightgbm)
#https://www.kaggle.com/c/amazon-employee-access-challenge/discussion/5128#38925

#matrix library
library(Matrix)

#catboost
library(catboost)

#fast naive bayes
library("fastNaiveBayes")

#tidyverse for easy data manipulation and visualization
#caret for easy machine learning workflow

library(tidyverse)
library(caret)

featureSoilTable <- read.csv(file = "featureTable.csv",stringsAsFactors=FALSE)
```
# Grouping data in a Pivot Table
```{r}
print(head(featureSoilTable))
```
# create the normalize function
```{r}
normalize <- function(x){
  return (as.numeric((x-min(x))/(max(x)-min(x))))
}
```
# preprocessing of the featuring table
```{r}
#change the NULL to na
featureSoilTable['h_texture'][featureSoilTable['h_texture'] == "NULL"] <- NA
#add appendix to colname to avoid mis-understand of the title of dataframe
colnames(featureSoilTable) <- paste("Str",colnames(featureSoilTable),sep = "_")
```
# print out the head of featureSoilTable
```{r}
print(head(featureSoilTable))
```
# remove invalid value and set NA value to 0
```{r}
#extract valid and invalid soil sample
validsoilTexture <- featureSoilTable[!is.na(featureSoilTable$Str_h_texture),]
invalidsoilTexture <- featureSoilTable[is.na(featureSoilTable$Str_h_texture),]

#remove all columns with na
validsoilTexture <- validsoilTexture[,colSums(is.na(validsoilTexture))<nrow(validsoilTexture)]

#change null value to 0
validsoilTexture[is.na(validsoilTexture)] = 0
```
# set x to numeric
```{r}
validsoilTexture$Str_h_texture <- as.numeric(as.factor(validsoilTexture$Str_h_texture))
validsoilTexture <- apply(validsoilTexture, 2, as.factor)
validsoilTexture <- apply(validsoilTexture, 2, as.numeric)
validsoilTexture[,-1]<- (apply(validsoilTexture[,-1],2,normalize))
validsoilTexture <- as.data.frame(validsoilTexture)
ncol <- ncol(validsoilTexture)
```
# print out the head of validsoilTexture
```{r}
print(head(validsoilTexture))
```
# set random seed
```{r}
set.seed(122)
```
# give the valid sample
```{r}
split = sample.split(validsoilTexture$Str_h_texture,SplitRatio = 0.7)

train_set = subset(validsoilTexture, split == TRUE)
test_set = subset(validsoilTexture, split == FALSE)

train_set$Str_h_texture = as.numeric(train_set$Str_h_texture)
test_set$Str_h_texture = as.numeric(test_set$Str_h_texture)
```

```{r}
summary(train_set)
```

```{r}
# Find the best model with the best cost parameter via 10-fold cross-validations

# the tunning part of svm, which will take lots of time to run

tryTypes=c(0:7)
tryCosts=c(1000,1,0.001)
bestCost=NA
bestAcc=0.6290723
bestType=NA

for(ty in tryTypes){

   for(co in tryCosts){
    acc=LiblineaR(data=train_set[,-1],target=train_set[,c("Str_h_texture")],type=7,cost=co,bias=1,verbose=FALSE)
    cat("Results for C=",co," : ",acc," accuracy.\n",sep="")
    if(acc>bestAcc){
    bestCost=co
    bestAcc=acc
    bestType=ty
    }
  }

}

```
# svm classifier
LIBLINEAR is a linear classifier for data with millions of instances and features. It supports L2-regularized classifiers, L2-loss linear SVM, L1-loss linear SVM, and logistic regression (LR).LiblineaR allows the estimation of predictive linear models for classification and regression, such as L1- or L2-regularized logistic regression, L1- or L2-regularized L2-loss support vector classification, L2-regularized L1-loss support vector classification and multi-class support vector classification. It also supports L2-regularized support vector regression (with L1- or L2-loss). The estimation of the models is particularly fast as compared to other libraries. 
```{r}
svmStarttime <- Sys.time()
svmClassifier <- LiblineaR(data = train_set[,-1],target = train_set[,c("Str_h_texture")],bias=1,cost = 1000)
svmPredictTrain <- predict(svmClassifier,train_set[,-1],proba=TRUE,decisionValues=TRUE)
svmPredictTrainTable <- table(svmPredictTrain$predictions,train_set[,c("Str_h_texture")])
svmEndtime <- Sys.time()
svmTimeTaken <- svmEndtime - svmStarttime
svmPredictTest <- predict(svmClassifier,test_set[,-1],proba=TRUE,decisionValues=TRUE)
svmPredictTestTable <- table(svmPredictTest$predictions,test_set[,c("Str_h_texture")])
```
# function for calculating the score of matirx
```{r}
sumElementinTable <- function(a,c,r){
  sum = 0
  for (i in c){
    if (i %in% r){
      sum = sum + a[i,i]
    }
  }
  return(sum)
}

```
# calculating the score of svmClassifier
```{r}

svmTestcol <- colnames(svmPredictTestTable)
svmTestrow <- rownames(svmPredictTestTable)

svmTraincol <- colnames(svmPredictTrainTable)
svmTrainrow <- rownames(svmPredictTrainTable)


svmPredictTestScore <- sumElementinTable(svmPredictTestTable,svmTestcol,svmTestrow)/sum(svmPredictTestTable)
svmPredictTrainScore <- sumElementinTable(svmPredictTrainTable,svmTraincol,svmTrainrow)/sum(svmPredictTrainTable)

```

```{r}
# the time of svm is:
cat("the running time of svm is",svmTimeTaken, "seconds")
```
```{r}
#the score of svm is

cat("The train score of svm algorithm is ",svmPredictTrainScore,'\n')

cat("The test score of svm algorithm is ",svmPredictTestScore)

```

# classification is CART model
```{r}
cartFit <- rpart(Str_h_texture ~ .,data = train_set,control = rpart.control(cp = 0.0001))

#get cp value
printcp(cartFit)
```

choose the CP with lowest xerror

```{r}
cartstartTime <- Sys.time()

fit.pruned = prune(cartFit, cp = 0.00020393)

cartPrediction <- predict(fit.pruned, test_set, type = "vector")

cartendTime <- Sys.time()

cartTimeTaken <- cartendTime - cartendTime

data.frame(test_set,cartPrediction)

cartPrediction = round(cartPrediction,0)
cartTable <- table(test_set$Str_h_texture,cartPrediction)

cartTable
```

calculate the score of cart model
```{r}
cartrow <- rownames(cartTable)
cartcol <- colnames(cartTable)
cartscore <- sumElementinTable(cartTable,cartrow,cartcol)/sum(cartTable)

```

the time of cart model
```{r}
cat("the time of cart",cartTimeTaken , "seconds")
```
the score of cart model

```{r}
cat('the score of cart model',cartscore)

```



# lightgbm

separate x and y from train_set and test_set 
```{r}

train_set.num_X <- select (train_set,-c(Str_h_texture))
test_set.num_X <- select (test_set,-c(Str_h_texture))

```

start lightgbm machine learning algorithms
```{r}
lstarttime <- Sys.time()
ltrain = lgb.Dataset(data = as.matrix(train_set.num_X),label = train_set$Str_h_texture, free_raw_data = FALSE)
params <- list(objective="regression", metric="l2")
model <- lgb.cv(params, 
                ltrain , 
                10, 
                nfold=5, 
                min_data=1, 
                learning_rate=1, 
                early_stopping_rounds=10,
                Depth = 8,
                lambda_l1 = 10,
                lambda_l2 = 10
)
lstoptime <- Sys.time()
```

# tunning parameters

num_leaves: This is the main parameter to control the complexity of the tree model. Theoretically, we can set num_leaves = 2^(max_depth) to obtain the same number of leaves as depth-wise tree. However, this simple conversion is not good in practice. The reason is that a leaf-wise tree is typically much deeper than a depth-wise tree for a fixed number of leaves. Unconstrained depth can induce over-fitting. Thus, when trying to tune the num_leaves, we should let it be smaller than 2^(max_depth). For example, when the max_depth=7 the depth-wise tree can get good accuracy, but setting num_leaves to 127 may cause over-fitting, and setting it to 70 or 80 may get better accuracy than depth-wise.

min_data_in_leaf: This is a very important parameter to prevent over-fitting in a leaf-wise tree. Its optimal value depends on the number of training samples and num_leaves. Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting. In practice, setting it to hundreds or thousands is enough for a large dataset.

max_depth: You also can use max_depth to limit the tree depth explicitly.

```{r}
ltest = lgb.Dataset.create.valid(ltrain , as.matrix(test_set.num_X), label = test_set$Str_h_texture)
valids <- list(test = ltest)

grid_search <- expand.grid(Depth = 1:8,
                           L1 = 8:16,
                           L2 = 8:16)

model <- list()
perf <- numeric(nrow(grid_search))

for (i in 1:nrow(grid_search)) {
  model[[i]] <- lgb.train(list(objective = "regression",
                               metric = "l2",
                               lambda_l1 = grid_search[i, "L1"],
                               lambda_l2 = grid_search[i, "L2"],
                               max_depth = grid_search[i, "Depth"]),
                          ltrain,
                          2,
                          valids,
                          min_data = 1,
                          learning_rate = 1,
                          early_stopping_rounds = 5,
                          num_leaves = 2,
                          num_iterations = 1000,
                          min_gain_to_split = 500,)
  
  perf[i] <- min(rbindlist(model[[i]]$record_evals$test$l2))
}

cat("Model ", which.min(perf), " is lowest loss: ", min(perf), sep = "")

print(grid_search[which.min(perf), ])
```

Algorithms score is around 0.38 and computational time is:
```{r}
lgbtaketime <- lstoptime - lstarttime
cat("The algorithms takes ", lgbtaketime, "seconds")

```

# catboost
```{r}

catstartTime <- Sys.time()

fit_params <- list(l2_leaf_reg = 0.001,
                   depth=6,
                   learning_rate = 0.1,
                   iterations = 100,
                   random_seed = 233)


pool = catboost.load_pool(as.matrix(train_set.num_X), label = as.integer(train_set[,1]))

model <- catboost.train(pool, params = fit_params)

catstopTime <- Sys.time()

cattakenTime <- catstopTime - catstartTime
```

calculate the prediction:
```{r}
#get the prediction
catprediction <- catboost.predict(model, 
                                  pool, 
                                  prediction_type = 'RawFormulaVal')
```

calculate the program score:
```{r}
#round the prediction
catprediction <- round(catprediction,0)

catTable <- table(train_set$Str_h_texture,catprediction)

catTablerow <- rownames(catTable)
catTablecol <- colnames(catTable)
catscore <- sumElementinTable(catTable,catTablerow,catTablecol)/sum(catTable)

```

```{r}
cat('The algorithm takes' ,cattakenTime , 'seconds')
```


```{r}
cat('The algorithm scores' ,catscore)
```
## naivebayes classification*
```{r}

nbstarttime <- Sys.time()
  
nbClassifier <- naiveBayes(as.factor(Str_h_texture) ~ .,data = train_set,laplace=2)
nbTestPrediction <- predict(nbClassifier,test_set,type = "class")
nbTableTest <- table(nbTestPrediction,test_set$Str_h_texture)

nbTestTablerow <- rownames(nbTableTest)
nbTestTablecol <- colnames(nbTableTest)
nbTestTablescore<- sumElementinTable(nbTableTest,nbTestTablerow,nbTestTablecol)/sum(nbTableTest)

nbendtime <- Sys.time()

nbTrainPrediction <- predict(nbClassifier,train_set,type = "class")
nbTrainTable <- table(nbTrainPrediction,train_set$Str_h_texture)

nbTrainTablerow <- rownames(nbTrainTable)
nbTrainTablecol <- colnames(nbTrainTable)
nbTrainTablescore <- sumElementinTable(nbTrainTable,nbTrainTablerow,nbTrainTablecol)/sum(nbTrainTable)

nbtakentime <- nbendtime - nbstarttime

```
# nbalgorithm 
```{r}
cat('NaiveBayes takes',nbtakentime,'seconds')

```

# nbscore
```{r}
cat('NaiveBayes score',nbTrainTablescore)

```


# fastNaiveBayes algorithms by gaussian
```{r}
fnbstartTime <- Sys.time()
dist <- fnb.detect_distribution(train_set.num_X)
gauss <- fnb.gaussian(train_set.num_X[,dist$gaussian], as.factor(train_set$Str_h_texture),sparse = TRUE,check = FALSE)
pred <- predict(gauss, train_set.num_X[,dist$gaussian])
fnbendTime <- Sys.time()
error <- mean(as.factor(train_set$Str_h_texture)!=pred)
print(error)
fnbtakentime <- fnbendTime - fnbstartTime
```

```{r}

print("fastNaiveBayes takes ", fnbtakentime, "seconds")
```
# Algorithms that cannot run in a specific time

# neuro network

We can use neuralnet() to train a NN model. Also, the train() function from caret can help us tune parameters.
We can plot the result to see which set of parameters is fit our data the best.

tuning parameter
```{r}
Model <- train(Str_h_texture ~ .,
               data=train_set,
               method="neuralnet",
               ### Parameters for layers
               tuneGrid = expand.grid(.layer1=c(1:2), .layer2=c(0:2), .layer3=c(0)),
               ### Parameters for optmization
               learningrate = 0.01,
               threshold = 0.01,
               stepmax = 5000
)
```

in nnclassifier y value should be normalized
```{r}
train_set.norm <- train_set
maxStr_h_texture <- max(train_set.norm$Str_h_texture)
minStr_h_texture <- min(train_set.norm$Str_h_texture)
train_set.norm$Str_h_texture <- normalize(train_set.norm$Str_h_texture)

nnClassifier <- neuralnet(Str_h_texture ~ .,data=train_set.norm, likelihood = TRUE, 
                          hidden = 1,linear.output = F,act.fct = "tanh")
print(nnClassifier$result.matrix)
plot(nnClassifier)
```

prediction
```{r}
output<- compute(nnClassifier,train_set[,-1])
p1 <- output$net.result
p1 <- p1 * (maxStr_h_texture-minStr_h_texture)
p1 <- round(p1,0)
nntable<-  table(train_set$Str_h_texture,p1)

```

# Classification with xgBoost
Xgboost can work perfectly in sparse matrix but it unfortunately cannot run in 5 hours

```{r}
xgb.train = xgb.DMatrix(data = as.matrix(train_set),label =as.matrix(train_set$Str_h_texture))
xgb.test = xgb.DMatrix(data = as.matrix(test_set),label = as.matrix(test_set$Str_h_texture))
validsoilTexture$Str_h_texture <- as.factor(validsoilTexture$Str_h_texture)
num_class = length(levels(validsoilTexture$Str_h_texture))

params = list(
  booster="gbtree",
  eta=0.001,
  max_depth=5,
  gamma=3,
  subsample=0.75,
  colsample_bytree=1,
  objective="multi:softprob",
  eval_metric="mlogloss",
  num_class=num_class+1
)

# Train the XGBoost classifer
xgb.fit=xgb.train(
  params=params,
  data=xgb.train,
  nrounds=10000,
  nthreads=1,
  early_stopping_rounds=10,
  watchlist=list(val1=xgb.train,val2=xgb.test),
  verbose=0
)

xgb.fit

```

# Algorithms that cannot run successfully 
Random Forest* The algorithm cannot run successfully since it will give an Error: cannot allocate vector of size 16.5 Gb
random forest is bad for sparse data which can be found in https://stats.stackexchange.com/questions/28828/is-there-a-random-forest-implementation-that-works-well-with-very-sparse-data
```{r}
RfClassifier = randomForest(Str_h_texture ~ .,data = train_set,proximity = T,mtry = 10)

rfTable <- table(predict(RfClassifier),train_set$Str_h_texture)

print(RfClassifier)
plot(RfClassifier)
```



